In [1]:
%matplotlib inline

In [2]:
#from __future__ import division
import pandas as pd
import numpy as np
from altair import Chart

In [3]:
!ls -lah ../data/*csv


-rw-rw-r-- 1 ilya ilya 457K Oct 26 10:56 ../data/dfa_mp.offset_150.win_100.csv
-rw-rw-r-- 1 ilya ilya 461K Oct 26 10:56 ../data/dfa_mp.offset_150.win_200.csv
-rw-rw-r-- 1 ilya ilya 453K Oct 26 10:56 ../data/dfa_mp.offset_150.win_50.csv
-rw-rw-r-- 1 ilya ilya 455K Oct 26 10:56 ../data/dfa_mp.offset_150.win_80.csv
-rw-rw-r-- 1 ilya ilya 457K Oct 26 10:56 ../data/dfa_mp.offset_200.win_100.csv
-rw-rw-r-- 1 ilya ilya 461K Oct 26 10:56 ../data/dfa_mp.offset_200.win_200.csv
-rw-rw-r-- 1 ilya ilya 454K Oct 26 10:56 ../data/dfa_mp.offset_200.win_50.csv
-rw-rw-r-- 1 ilya ilya 456K Oct 26 10:56 ../data/dfa_mp.offset_200.win_80.csv
-rw-rw-r-- 1 ilya ilya 457K Oct 26 10:56 ../data/dfa_mp.offset_300.win_100.csv
-rw-rw-r-- 1 ilya ilya 461K Oct 26 10:56 ../data/dfa_mp.offset_300.win_200.csv
-rw-rw-r-- 1 ilya ilya 454K Oct 26 10:56 ../data/dfa_mp.offset_300.win_50.csv
-rw-rw-r-- 1 ilya ilya 455K Oct 26 10:56 ../data/dfa_mp.offset_300.win_80.csv
-rw-rw-r-- 1 ilya ilya 603K Jul 16 11:19 ../data/dfm.csv

Load the data


In [4]:
offsets = [150,200,300]
winsizes = [50,80,100,200]
output_tpl = '../data/dfa_mp.offset_{}.win_{}.csv'

output = []

for offset in offsets:
    for winsize in winsizes:
        df = pd.DataFrame.from_csv(output_tpl.format(offset, winsize))
        df['win'] = winsize
        df['offset'] = offset
        output.append(df)
        
dfa = pd.concat(output)

In [5]:
dfa['UTR_length'] = dfa['end_x'] - dfa['start_x']
dfa


Out[5]:
TSS end_x start_x gene strand_x end_y start_y strand_y strand ratio_ATCACG ratio_ACAGTG ratio_CGATGT ratio_GCCAAT win offset UTR_length
0 148 190 148 thrL + 255.0 190.0 + + 3.000000 2.784355 0.911828 3.178117 50 150 42
1 148 190 148 thrL + 255.0 190.0 + + 3.000000 2.784355 0.911828 3.178117 50 150 42
2 5030 5234 5030 yaaX + 5530.0 5234.0 + + 4.576923 6.983333 1.264901 1.436242 50 150 204
3 6587 6587 6459 yaaA - 6459.0 5683.0 - - 0.032028 0.072193 0.567568 0.600000 50 150 128
4 6615 6615 6459 yaaA - 6459.0 5683.0 - - 0.034091 0.090379 0.654135 0.582011 50 150 156
5 8017 8017 7959 yaaJ - 7959.0 6529.0 - - 0.875000 0.571429 0.885246 1.196262 50 150 58
6 8191 8238 8191 talB + 9191.0 8238.0 + + 0.478825 0.513356 0.473950 0.564393 50 150 47
9 11542 11542 11356 yaaW - 11356.0 10643.0 - - 0.666667 1.777778 1.327273 1.012658 50 150 186
10 11825 11825 11786 yaaI - 11786.0 11382.0 - - 0.500000 2.625000 0.652330 0.474874 50 150 39
11 11913 11913 11786 yaaI - 11786.0 11382.0 - - 0.333333 0.555556 1.748148 1.713376 50 150 127
12 11938 11938 11786 yaaI - 11786.0 11382.0 - - 0.857143 0.428571 1.100592 1.442623 50 150 152
13 12048 12163 12048 dnaK + 14079.0 12163.0 + + 0.252212 0.207481 0.171599 0.301158 50 150 115
14 12123 12163 12123 dnaK + 14079.0 12163.0 + + 0.869191 0.539653 0.430504 1.010352 50 150 40
15 12144 12163 12144 dnaK + 14079.0 12163.0 + + 0.979294 0.717621 0.513948 1.066012 50 150 19
18 16951 16951 16903 hokC - 16903.0 16751.0 - - 0.478261 0.569767 0.599631 0.459902 50 150 48
19 17317 17489 17317 nhaA + 18655.0 17489.0 + + 0.052632 0.126904 2.822222 1.647166 50 150 172
20 17458 17489 17458 nhaA + 18655.0 17489.0 + + 1.067073 1.989583 0.762238 1.602339 50 150 31
21 21120 21120 21078 rpsT - 21078.0 20815.0 - - 0.752518 0.615503 0.493768 0.752228 50 150 42
22 21210 21210 21078 rpsT - 21078.0 20815.0 - - 0.278619 0.579581 0.220507 0.358928 50 150 132
23 21383 21407 21383 ribF + 22348.0 21407.0 + + 0.922207 1.056693 0.849432 0.966921 50 150 24
24 21833 22391 21833 ileS + 25207.0 22391.0 + + 1.352113 1.040936 1.098859 1.163180 50 150 558
25 22034 22391 22034 ileS + 25207.0 22391.0 + + 0.528970 0.743542 0.934363 0.388699 50 150 357
26 22229 22391 22229 ileS + 25207.0 22391.0 + + 0.418221 0.240061 0.299776 0.510862 50 150 162
27 25014 25207 25014 lspA + 25701.0 25207.0 + + 0.850227 0.498730 0.592040 0.854137 50 150 193
28 28288 28374 28288 dapB + 29195.0 28374.0 + + 0.544828 1.341176 0.757576 0.496063 50 150 86
29 28343 28374 28343 dapB + 29195.0 28374.0 + + 1.752809 1.933333 1.785714 1.243902 50 150 31
30 29551 29651 29551 carA + 30799.0 29651.0 + + 0.790000 0.430233 0.240310 0.424419 50 150 100
31 29619 29651 29619 carA + 30799.0 29651.0 + + 0.788462 0.435897 0.461957 0.466912 50 150 32
32 30775 30817 30775 carB + 34038.0 30817.0 + + 0.513514 0.761194 0.406593 1.136000 50 150 42
33 34218 34300 34218 caiF + 34695.0 34300.0 + + 0.764706 1.388889 0.357143 0.403846 50 150 82
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3754 4609344 4609414 4609344 prfC + 4611003.0 4609414.0 + + 1.043222 0.723374 0.815589 1.112554 200 300 70
3755 4609356 4609414 4609356 prfC + 4611003.0 4609414.0 + + 1.113715 0.751312 0.856031 1.154138 200 300 58
3756 4611153 4611396 4611153 osmY + 4612001.0 4611396.0 + + 1.070175 1.486726 0.866915 0.928707 200 300 243
3757 4616679 4617323 4616679 deoC + 4618102.0 4617323.0 + + 1.140625 0.526882 1.102041 0.934307 200 300 644
3758 4617278 4617323 4617278 deoC + 4618102.0 4617323.0 + + 1.826840 2.649815 1.678423 2.319249 200 300 45
3759 4619567 4619603 4619567 deoB + 4620826.0 4619603.0 + + 0.520743 0.548993 0.379441 0.564190 200 300 36
3760 4621657 4621769 4621657 yjjJ + 4623100.0 4621769.0 + + 3.920833 14.337209 1.594747 1.465487 200 300 112
3761 4621716 4621769 4621716 yjjJ + 4623100.0 4621769.0 + + 1.156682 2.305970 0.783037 0.875229 200 300 53
3762 4624238 4624238 4624117 lplA - 4624117.0 4623101.0 - - 2.214286 1.905263 1.272436 1.488294 200 300 121
3763 4624799 4624799 4624789 ytjB - 4624789.0 4624145.0 - - 1.145985 1.015267 0.684332 0.680734 200 300 10
3764 4624856 4624895 4624856 serB + 4625863.0 4624895.0 + + 1.471910 2.332288 1.002410 1.592798 200 300 39
3765 4630566 4630566 4630522 yjjK - 4630522.0 4628855.0 - - 0.975590 1.301775 0.613567 0.892770 200 300 44
3766 4630700 4630733 4630700 slt + 4632670.0 4630733.0 + + 0.843023 0.823171 0.951872 0.894191 200 300 33
3767 4632704 4632760 4632704 trpR + 4633086.0 4632760.0 + + 1.302372 1.444231 0.835112 0.975930 200 300 56
3768 4633773 4633773 4633745 yjjX - 4633745.0 4633233.0 - - 3.361868 3.631399 1.012745 1.262749 200 300 28
3769 4633899 4633899 4633745 yjjX - 4633745.0 4633233.0 - - 3.716738 5.043478 1.026196 1.508820 200 300 154
3770 4635243 4635521 4635243 creA + 4635994.0 4635521.0 + + 2.986159 2.272251 1.123288 1.083676 200 300 278
3771 4635353 4635353 4635310 rob - 4635310.0 4634441.0 - - 0.902421 0.560804 0.475946 0.853982 200 300 43
3772 4635477 4635521 4635477 creA + 4635994.0 4635521.0 + + 0.989286 1.091716 0.517182 0.904128 200 300 44
3773 4638160 4638178 4638160 creD + 4639530.0 4638178.0 + + 1.642857 3.800000 1.421053 0.748068 200 300 18
3774 4640358 4640402 4640358 yjjY + 4640542.0 4640402.0 + + 13.010830 11.512545 14.250000 7.067416 200 300 44
3775 4640508 4640508 4640306 arcA - 4640306.0 4639590.0 - - 1.163365 0.827256 0.823056 1.180585 200 300 202
3776 4640512 4640512 4640306 arcA - 4640306.0 4639590.0 - - 1.167142 0.837495 0.831858 1.187599 200 300 206
3777 4640535 4640535 4640306 arcA - 4640306.0 4639590.0 - - 1.057403 0.763457 0.804410 1.089905 200 300 229
3778 4640599 4640599 4640306 arcA - 4640306.0 4639590.0 - - 0.867294 0.786859 0.757342 0.862293 200 300 293
3779 4640681 4640681 4640306 arcA - 4640306.0 4639590.0 - - 0.542907 0.452288 0.399267 0.477665 200 300 375
3780 4640688 4640688 4640306 arcA - 4640306.0 4639590.0 - - 0.515849 0.440549 0.386567 0.455797 200 300 382
3781 4640801 4640801 4640306 arcA - 4640306.0 4639590.0 - - 0.089461 0.110785 0.126010 0.168638 200 300 495
3782 4640838 4640942 4640838 yjtD + 4641628.0 4640942.0 + + 1.639535 0.945946 1.095890 2.051546 200 300 104
3783 4640898 4640942 4640898 yjtD + 4641628.0 4640942.0 + + 1.056604 0.763636 0.952381 1.453782 200 300 44

43812 rows × 16 columns

Let's see how it looks like for one particular window and offset value


In [6]:
d = dfa[(dfa['UTR_length'] > 80)
        & (dfa['ratio_ATCACG'] > 2)
        & (dfa['offset'] == 200)
        & (dfa['win'] == 80)][['UTR_length', 'ratio_ATCACG','ratio_CGATGT']].copy()
d['log-bcm'] = np.log10(d['ratio_ATCACG'])
d['log+bcm'] = np.log10(d['ratio_CGATGT'])
d['loglen'] = np.log10(d['UTR_length'])

In [7]:
d.shape


Out[7]:
(292, 6)

In [18]:
d


Out[18]:
UTR_length ratio_ATCACG ratio_CGATGT log-bcm log+bcm loglen
2 204 4.275862 0.820000 0.631024 -0.086186 2.309630
39 215 2.333333 3.521127 0.367977 0.546682 2.332438
84 156 6.272727 1.447257 0.797456 0.160546 2.193125
88 99 2.810026 0.772128 0.448710 -0.112311 1.995635
89 120 2.761506 0.675000 0.441146 -0.170696 2.079181
95 96 2.233914 0.950076 0.349066 -0.022242 1.982271
153 186 63.143939 34.624434 1.800332 1.539383 2.269513
154 183 63.918919 34.704545 1.805629 1.540386 2.262451
155 110 70.729064 37.570681 1.849598 1.574849 2.041393
159 122 4.146018 2.969582 0.617631 0.472695 2.086360
160 116 4.328704 3.075099 0.636358 0.487859 2.064458
162 902 2.199029 2.207650 0.342231 0.343930 2.955207
172 132 61.875000 4.588957 1.791515 0.661714 2.120574
176 286 13.245690 7.841642 1.122075 0.894407 2.456366
177 178 109.071429 67.159292 2.037711 1.827106 2.250420
245 93 2.200000 2.991379 0.342423 0.475871 1.968483
258 165 2.750000 0.790055 0.439333 -0.102343 2.217484
276 186 2.500000 0.694690 0.397940 -0.158209 2.269513
277 152 87.200000 3.755853 1.940516 0.574709 2.181844
307 122 3.666667 0.669231 0.564271 -0.174424 2.086360
320 214 4.833333 0.809160 0.684247 -0.091965 2.330414
340 82 4.195616 1.570605 0.622796 0.196067 1.913814
347 183 59.916667 5.461538 1.777548 0.737315 2.262451
356 82 2.319484 0.917614 0.365391 -0.037340 1.913814
357 139 2.818035 0.998932 0.449946 -0.000464 2.143015
358 141 2.786164 0.989362 0.445007 -0.004645 2.149219
379 185 19.782609 5.887500 1.296284 0.769931 2.267172
418 182 2.278846 0.535211 0.357715 -0.271475 2.260071
419 184 2.368932 0.561594 0.374553 -0.250577 2.264818
420 204 3.252747 0.647510 0.512250 -0.188754 2.309630
... ... ... ... ... ... ...
3423 116 2.343750 0.554545 0.369911 -0.256063 2.064458
3431 292 6.928065 5.278667 0.840612 0.722524 2.465383
3432 292 6.928065 5.278667 0.840612 0.722524 2.465383
3433 175 155.278912 69.444444 2.191112 1.841638 2.243038
3434 175 155.278912 69.444444 2.191112 1.841638 2.243038
3443 189 6.058824 2.487603 0.782388 0.395781 2.276462
3444 184 5.956522 2.403226 0.774993 0.380795 2.264818
3454 284 9.672504 6.210826 0.985539 0.793149 2.453318
3455 176 145.689394 81.254717 2.163428 1.909849 2.245513
3456 114 6.597561 1.768349 0.819383 0.247568 2.056905
3463 142 26.000000 1.101124 1.414973 0.041836 2.152288
3464 204 19.000000 1.244444 1.278754 0.094976 2.309630
3465 323 38.076923 22.057143 1.580662 1.343549 2.509203
3471 306 3.022222 0.924901 0.480326 -0.033905 2.485721
3483 180 2.803448 0.891986 0.447693 -0.049642 2.255273
3540 95 3.777419 2.308852 0.577195 0.363396 1.977724
3550 531 2.173333 1.377451 0.337126 0.139076 2.725095
3552 194 4.000000 4.494253 0.602060 0.652658 2.287802
3584 94 8.224719 1.580838 0.915121 0.198887 1.973128
3602 146 3.232323 2.173258 0.509515 0.337111 2.164353
3648 101 2.018182 0.549451 0.304960 -0.260071 2.004321
3661 164 21.000000 1.043478 1.322219 0.018483 2.214844
3702 427 4.000000 1.863158 0.602060 0.270250 2.630428
3703 290 2.538462 0.526718 0.404571 -0.278422 2.462398
3706 421 5.000000 0.629630 0.698970 -0.200915 2.624282
3726 130 3.257576 1.777592 0.512895 0.249832 2.113943
3749 108 28.709748 4.761766 1.458029 0.677768 2.033424
3760 112 4.765432 1.843658 0.678102 0.265680 2.049218
3770 278 5.253165 3.691667 0.720421 0.567222 2.444045
3782 104 3.416667 1.170732 0.533603 0.068457 2.017033

292 rows × 6 columns


In [28]:
from copy import deepcopy
import statsmodels.api as sm
import altair

def linear_regression(x, y):
    p = np.polyfit(x, y, 1)
    return np.polyval(p, x)

def lowess(x, y):
    return sm.nonparametric.lowess(y, x, frac=1/7, return_sorted=False)

def rmean(x, y):
    win = y.shape[0] // 20
    return y.rolling(center=True, window=win).mean()

class RegressionChart(altair.Chart):
    @staticmethod
    def _add_regression_column(group, regression_func, x, y, yfit):
        group[yfit] = regression_func(group[x], group[y])
        return group
    
    def regression_plot(self, func=linear_regression, **kwargs):
        if not isinstance(self.data, pd.DataFrame):
            raise ValueError("data must be a DataFrame")
            
        points = self.mark_point()
        lines = deepcopy(self).mark_line()
        
        encoding = points.encoding.to_dict()
        if any(enc.get('bin', False) for enc in encoding.values()):
            raise ValueError("regress() cannot handle binned variables")
            
        group_cols = [enc['field'] for key,enc in encoding.items()
                     if key not in ['x', 'y']]
        x = encoding['x']['field']
        y = encoding['y']['field']
        yfit = y + '_fit'
        lines.encode(y=yfit)
        if group_cols:
            groups = self.data.groupby(group_cols)
            data = groups.apply(self._add_regression_column, regression_func=func,
                               x=x, y=y, yfit=yfit)
        else:
            data = self._add_regression_column(self.data.copy(),
                                               regression_func=func,
                                               x=x, y=y, yfit=yfit)
            
        return altair.LayeredChart(data).set_layers(points, lines)

In [30]:
from altair import X, Y, Scale

RegressionChart(d).mark_circle().encode(
    X('loglen:Q', scale=Scale(domain=(1.6, 3))), 
    y='log-bcm'
).regression_plot(func=lowess)



In [17]:
from altair import X, Y, Scale

RegressionChart(d).mark_circle().encode(
    X('loglen:Q', scale=Scale(domain=(1.6, 3))), 
    y='log-bcm'
).regression_plot(func=linear_regression)



In [ ]: